by César Pérez
In this article, I'll present an example of scanning a file system and extracting information from it using recursive functions. The goal is to map a target folder, build a visual file tree, summarize disk usage by file type, and publish everything in an HTML report.
This function will scan our file system by using os.listdir() to extract the contents of a target folder, recursing whenever it finds a subdirectory. For the purposes of this article, I will exclude everything in the .git folder.
import os
import pandas as pd
import re
from treelib import Node, Tree
from operator import itemgetter
#creates the file system map as a nested dictionary
def file_scan(root_dir):
    current_path = os.listdir(root_dir)
    file_count = 0
    file_list = []
    dir_count = 0
    dir_dict = {}
    for element in current_path:
        next_path = os.path.join(root_dir, element)
        if os.path.isdir(next_path) and element != '.git':
            # subdirectory: recurse and store its dictionary under its name
            dir_count += 1
            dir_dict[element] = file_scan(next_path)
        elif element != '.git':
            # regular file: record its name and size in bytes
            file_count += 1
            file_list.append({'name': element, 'size': os.stat(next_path).st_size})
    dir_dict['total_files'] = file_count
    dir_dict['total_directories'] = dir_count
    dir_dict['files'] = file_list
    return dir_dict
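As an aside, the same traversal can also be written with os.scandir(), which yields directory entries along with their metadata and avoids a separate os.stat() call per file. This is an alternative sketch under the same assumption of skipping .git; the rest of the notebook keeps using file_scan() above.
#Alternative sketch of file_scan() using os.scandir() instead of os.listdir()
def file_scan_scandir(root_dir):
    dir_dict = {}
    file_count = 0
    dir_count = 0
    file_list = []
    for entry in os.scandir(root_dir):
        if entry.name == '.git':
            continue
        if entry.is_dir():
            dir_count += 1
            dir_dict[entry.name] = file_scan_scandir(entry.path)
        else:
            file_count += 1
            file_list.append({'name': entry.name, 'size': entry.stat().st_size})
    dir_dict['total_files'] = file_count
    dir_dict['total_directories'] = dir_count
    dir_dict['files'] = file_list
    return dir_dict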
def create_tree(dir_dict):
    tree = Tree()
    tree.create_node("*", "root")
    for element in dir_dict:
        if type(dir_dict[element]) == dict:
            tree.create_node(element, element, parent="root")
            create_node(dir_dict[element], element, tree)
        elif type(dir_dict[element]) == list:
            for list_element in dir_dict[element]:
                tree.create_node(list_element['name'], list_element['name'], parent="root")
    return tree
#Function complementary to create_tree()
def create_node(child_dict, level, tree):
    for element in child_dict:
        if type(child_dict[element]) == dict:
            tree.create_node(element, element, parent=level)
            create_node(child_dict[element], element, tree)
        elif type(child_dict[element]) == list:
            for list_element in child_dict[element]:
                tree.create_node(list_element['name'], list_element['name'], parent=level)
    return
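To see how these two functions cooperate, here is a small, purely hypothetical dictionary shaped like file_scan()'s output being turned into a tree. Note that treelib requires node identifiers to be unique, so this construction assumes there are no duplicate file or folder names anywhere in the scanned tree.
#Hypothetical sample mimicking file_scan()'s output structure
sample = {
    'total_files': 1,
    'total_directories': 1,
    'files': [{'name': 'index.html', 'size': 9907}],
    'css': {'total_files': 1, 'total_directories': 0,
            'files': [{'name': 'style.css', 'size': 4871}]},
}
demo_tree = create_tree(sample)
print(demo_tree.size())   # 4 nodes: root, index.html, css, style.css
print(demo_tree.depth())  # 2, since style.css sits two levels below the root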
The next two functions create two outputs: an HTML table summarizing count and size per file extension, and the data for a pie chart of size by file type.
def creates_summary(dir_dict):
    summary_df = pd.DataFrame()
    summary_count = {}
    summary_size = {}
    pie_chart_data = []
    for element in dir_dict:
        if type(dir_dict[element]) == dict:
            append_summary(dir_dict[element], summary_count, summary_size)
        elif type(dir_dict[element]) == list:
            for list_element in dir_dict[element]:
                file_ext = re.search(r'[\w ]*\.([\w]*)', list_element['name']).group(1)
                summary_count[file_ext] = summary_count.get(file_ext, 0) + 1
                summary_size[file_ext] = summary_size.get(file_ext, 0) + (list_element['size']/1000) #Represents KB
    for element in summary_count:
        summary_df.loc[element, 'count'] = summary_count[element]
        summary_df.loc[element, 'size_KB'] = round(summary_size[element], 2)
        pie_chart_data.append([element, summary_size[element]])
    pie_chart_data = sorted(pie_chart_data, key=itemgetter(1), reverse=True)
    pie_chart_data.insert(0, ['File type', 'Size'])
    summary_df['size_%'] = round((summary_df['size_KB'].astype(float) / summary_df['size_KB'].sum()) * 100, 2)
    summary_df = summary_df.sort_values(by=['size_%'], ascending=False)
    return summary_df.to_html(justify='center'), pie_chart_data
#Function complementary to creates_summary()
def append_summary(child_dict, summary_count, summary_size):
    for element in child_dict:
        if type(child_dict[element]) == dict:
            append_summary(child_dict[element], summary_count, summary_size)
        elif type(child_dict[element]) == list:
            for list_element in child_dict[element]:
                file_ext = re.search(r'[\w ]*\.([\w]*)', list_element['name']).group(1)
                summary_count[file_ext] = summary_count.get(file_ext, 0) + 1
                summary_size[file_ext] = summary_size.get(file_ext, 0) + (list_element['size']/1000) #Represents KB
    return
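Both functions pull the file extension out of the file name with a small regular expression. A quick illustration of what that pattern captures, using a few hypothetical file names:
pattern = r'[\w ]*\.([\w]*)'
for name in ['report final.html', 'CV_Cesar_Perez_Eng.pdf', 'archive.tar.gz']:
    print(name, '->', re.search(pattern, name).group(1))
#report final.html -> html
#CV_Cesar_Perez_Eng.pdf -> pdf
#archive.tar.gz -> tar (only the segment right after the first dot is captured)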
Now that our functions have been declared, it's time to execute them and gather the data about our file system that we want to explore and share. Each of the following three cells prints its final output so we have a better understanding of what will be sent to the HTML report.
#1. Get the notebook's working directory
dirname = os.getcwd()
#2. Build the main dictionary by scanning the repository root (two levels up)
main_dict = file_scan(os.path.join(dirname, '..//..'))
main_dict
{'total_files': 4, 'total_directories': 7, 'files': [{'name': 'index.html', 'size': 9907}, {'name': 'jupyter.html', 'size': 4597}, {'name': 'README.md', 'size': 44}, {'name': 'rmarkdown.html', 'size': 8420}], 'certificates': {'total_files': 8, 'total_directories': 0, 'files': [{'name': 'Data_Analyst_Carso.pdf', 'size': 30610}, {'name': 'Data_Mining_Carso.pdf', 'size': 30501}, {'name': 'Data_Science_Fundamentals_with_Python_and_SQL_Program_Coursera.pdf', 'size': 326958}, {'name': 'Google_Data_Analytics_Program_Coursera.pdf', 'size': 343510}, {'name': 'Google_IT_Automation_with_Python_Program_Coursera.pdf', 'size': 344731}, {'name': 'Probabilidad_y_Estadistica_Coursera.pdf', 'size': 401904}, {'name': 'Python_for_Everyone_Coursera.pdf', 'size': 330069}, {'name': 'Web_Design_for_Everyone_Coursera.pdf', 'size': 348725}]}, 'css': {'total_files': 1, 'total_directories': 0, 'files': [{'name': 'style.css', 'size': 4871}]}, 'cv': {'total_files': 2, 'total_directories': 0, 'files': [{'name': 'CV_Cesar_Perez_Eng.pdf', 'size': 139929}, {'name': 'CV_Cesar_Perez_Esp.pdf', 'size': 140741}]}, 'img': {'total_files': 7, 'total_directories': 1, 'files': [{'name': 'Montreal.jpg', 'size': 239713}, {'name': 'MyPic.jpg', 'size': 575484}, {'name': 'real_monte1.jpg', 'size': 153924}, {'name': 'real_monte2.jpg', 'size': 1001247}, {'name': 'real_monte4.jpeg', 'size': 186387}, {'name': 'Seoul.jpg', 'size': 578631}, {'name': 'view_home1.jpg', 'size': 925442}], 'thumbnail': {'total_files': 8, 'total_directories': 0, 'files': [{'name': 'Data_Analyst_Carso_thumbnail.png', 'size': 94840}, {'name': 'Data_Mining_Carso_thumbnail.png', 'size': 76513}, {'name': 'Data_Science_Fundamentals_with_Python_and_SQL_Program_Coursera_thumbnail.png', 'size': 353725}, {'name': 'Google_Data_Analytics_Program_Coursera_thumbnail.png', 'size': 347224}, {'name': 'Google_IT_Automation_with_Python_Program_Coursera_thumbnail.png', 'size': 368312}, {'name': 'Probabilidad_y_Estadistica_Coursera_thumbnail.png', 'size': 310200}, {'name': 'Python_for_Everyone_Coursera_thumbnail.png', 'size': 467097}, {'name': 'Web_Design_for_Everyone_Coursera_thumbnail.png', 'size': 491931}]}}, 'js': {'total_files': 1, 'total_directories': 0, 'files': [{'name': 'lib.js', 'size': 3698}]}, 'jupyter': {'FileScan': {'total_files': 4, 'total_directories': 0, 'files': [{'name': 'FileSystem_Scan.html', 'size': 639501}, {'name': 'FileSystem_Scan.ipynb', 'size': 11680}, {'name': 'report.html', 'size': 7588}, {'name': 'template.html', 'size': 3097}]}, 'total_files': 0, 'total_directories': 2, 'files': [], 'SqlStressTesting': {'total_files': 6, 'total_directories': 0, 'files': [{'name': 'report_MSSQL.csv', 'size': 405444}, {'name': 'report_SQLite.csv', 'size': 453841}, {'name': 'SQL_stress_testing.html', 'size': 787137}, {'name': 'SQL_stress_testing.ipynb', 'size': 184085}, {'name': 'testMSSQL.py', 'size': 2125}, {'name': 'testSQLite.py', 'size': 1655}]}}, 'rmarkdown': {'total_files': 4, 'total_directories': 0, 'files': [{'name': 'Bicycle-Trip-Analysis-R.html', 'size': 1388017}, {'name': 'Dating-app-review-analysis-R.html', 'size': 1493439}, {'name': 'Fitness-Device-Usage-Analysis-R.html', 'size': 1597277}, {'name': 'hotel-booking-analysis-R.html', 'size': 1853495}]}}
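The nested dictionary above is printed on a single line, which gets hard to read as the tree grows. During development it can help to pretty-print it; a quick aside, not needed for the report itself:
import json
print(json.dumps(main_dict, indent=2))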
#3. File tree
tree_filename = os.path.join(dirname, 'tree.txt')
file_tree = create_tree(main_dict)
file_tree.save2file(tree_filename)
tree_txt_reader = open(tree_filename, 'r', encoding='utf-8')
tree_txt = ""
for line in tree_txt_reader.readlines():
    line = '<pre>'+line+'</pre>'
    tree_txt += line
tree_txt_reader.close()
os.remove(tree_filename)
tree_txt
'<pre>*\n</pre><pre>├── README.md\n</pre><pre>├── certificates\n</pre><pre>│ ├── Data_Analyst_Carso.pdf\n</pre><pre>│ ├── Data_Mining_Carso.pdf\n</pre><pre>│ ├── Data_Science_Fundamentals_with_Python_and_SQL_Program_Coursera.pdf\n</pre><pre>│ ├── Google_Data_Analytics_Program_Coursera.pdf\n</pre><pre>│ ├── Google_IT_Automation_with_Python_Program_Coursera.pdf\n</pre><pre>│ ├── Probabilidad_y_Estadistica_Coursera.pdf\n</pre><pre>│ ├── Python_for_Everyone_Coursera.pdf\n</pre><pre>│ └── Web_Design_for_Everyone_Coursera.pdf\n</pre><pre>├── css\n</pre><pre>│ └── style.css\n</pre><pre>├── cv\n</pre><pre>│ ├── CV_Cesar_Perez_Eng.pdf\n</pre><pre>│ └── CV_Cesar_Perez_Esp.pdf\n</pre><pre>├── img\n</pre><pre>│ ├── Montreal.jpg\n</pre><pre>│ ├── MyPic.jpg\n</pre><pre>│ ├── Seoul.jpg\n</pre><pre>│ ├── real_monte1.jpg\n</pre><pre>│ ├── real_monte2.jpg\n</pre><pre>│ ├── real_monte4.jpeg\n</pre><pre>│ ├── thumbnail\n</pre><pre>│ │ ├── Data_Analyst_Carso_thumbnail.png\n</pre><pre>│ │ ├── Data_Mining_Carso_thumbnail.png\n</pre><pre>│ │ ├── Data_Science_Fundamentals_with_Python_and_SQL_Program_Coursera_thumbnail.png\n</pre><pre>│ │ ├── Google_Data_Analytics_Program_Coursera_thumbnail.png\n</pre><pre>│ │ ├── Google_IT_Automation_with_Python_Program_Coursera_thumbnail.png\n</pre><pre>│ │ ├── Probabilidad_y_Estadistica_Coursera_thumbnail.png\n</pre><pre>│ │ ├── Python_for_Everyone_Coursera_thumbnail.png\n</pre><pre>│ │ └── Web_Design_for_Everyone_Coursera_thumbnail.png\n</pre><pre>│ └── view_home1.jpg\n</pre><pre>├── index.html\n</pre><pre>├── js\n</pre><pre>│ └── lib.js\n</pre><pre>├── jupyter\n</pre><pre>│ ├── FileScan\n</pre><pre>│ │ ├── FileSystem_Scan.html\n</pre><pre>│ │ ├── FileSystem_Scan.ipynb\n</pre><pre>│ │ ├── report.html\n</pre><pre>│ │ └── template.html\n</pre><pre>│ └── SqlStressTesting\n</pre><pre>│ ├── SQL_stress_testing.html\n</pre><pre>│ ├── SQL_stress_testing.ipynb\n</pre><pre>│ ├── report_MSSQL.csv\n</pre><pre>│ ├── report_SQLite.csv\n</pre><pre>│ ├── testMSSQL.py\n</pre><pre>│ └── testSQLite.py\n</pre><pre>├── jupyter.html\n</pre><pre>├── rmarkdown\n</pre><pre>│ ├── Bicycle-Trip-Analysis-R.html\n</pre><pre>│ ├── Dating-app-review-analysis-R.html\n</pre><pre>│ ├── Fitness-Device-Usage-Analysis-R.html\n</pre><pre>│ └── hotel-booking-analysis-R.html\n</pre><pre>└── rmarkdown.html\n</pre>'
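As a side note, the loop that wraps each line in <pre> tags could also be written as a single join over the file object; an equivalent sketch, meant as a drop-in for the loop above (before tree.txt is removed):
with open(tree_filename, 'r', encoding='utf-8') as reader:
    tree_txt = ''.join('<pre>' + line + '</pre>' for line in reader)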
#4. Pie Chart and table
table_summary, pie_chart = creates_summary(main_dict)
table_summary
'<table border="1" class="dataframe">\n <thead>\n <tr style="text-align: center;">\n <th></th>\n <th>count</th>\n <th>size_KB</th>\n <th>size_%</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>html</th>\n <td>11.0</td>\n <td>7792.48</td>\n <td>44.61</td>\n </tr>\n <tr>\n <th>jpg</th>\n <td>6.0</td>\n <td>3474.44</td>\n <td>19.89</td>\n </tr>\n <tr>\n <th>png</th>\n <td>8.0</td>\n <td>2509.84</td>\n <td>14.37</td>\n </tr>\n <tr>\n <th>pdf</th>\n <td>10.0</td>\n <td>2437.68</td>\n <td>13.95</td>\n </tr>\n <tr>\n <th>csv</th>\n <td>2.0</td>\n <td>859.29</td>\n <td>4.92</td>\n </tr>\n <tr>\n <th>ipynb</th>\n <td>2.0</td>\n <td>195.77</td>\n <td>1.12</td>\n </tr>\n <tr>\n <th>jpeg</th>\n <td>1.0</td>\n <td>186.39</td>\n <td>1.07</td>\n </tr>\n <tr>\n <th>css</th>\n <td>1.0</td>\n <td>4.87</td>\n <td>0.03</td>\n </tr>\n <tr>\n <th>js</th>\n <td>1.0</td>\n <td>3.70</td>\n <td>0.02</td>\n </tr>\n <tr>\n <th>py</th>\n <td>2.0</td>\n <td>3.78</td>\n <td>0.02</td>\n </tr>\n <tr>\n <th>md</th>\n <td>1.0</td>\n <td>0.04</td>\n <td>0.00</td>\n </tr>\n </tbody>\n</table>'
pie_chart
[['File type', 'Size'], ['html', 7792.475], ['jpg', 3474.441], ['png', 2509.842], ['pdf', 2437.678], ['csv', 859.2850000000001], ['ipynb', 195.76500000000001], ['jpeg', 186.387], ['css', 4.871], ['py', 3.7800000000000002], ['js', 3.698], ['md', 0.044]]
For the purposes of this article, I've decided to use a simple approach: I've created an HTML template with placeholders, Python reads that file and replaces the placeholders with the system data, and the result is written to a separate report file.
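The actual template.html isn't reproduced in this notebook, but a minimal hypothetical skeleton only needs the three placeholders that the next cell substitutes; something along these lines (the real template presumably also includes the JavaScript and CSS that render the pie chart and style the report, which is omitted here):
#Hypothetical minimal template containing the three placeholders used below
minimal_template = """<!DOCTYPE html>
<html>
  <head><title>File system report</title></head>
  <body>
    <h2>Summary by file type</h2>
    table_goes_here
    <script>var pieChartData = pie_chart_data_goes_here;</script>
    <h2>File tree</h2>
    file_tree_goes_here
  </body>
</html>"""
with open(os.path.join(dirname, 'template_example.html'), 'w', encoding='utf-8') as writer:
    writer.write(minimal_template)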
#5. Create report
template_filename = os.path.join(dirname, 'template.html')
template_reader = open(template_filename, 'r')
template_content = template_reader.read()
template_reader.close()
new_content = re.sub("pie_chart_data_goes_here", str(pie_chart), template_content)
new_content = re.sub("table_goes_here", str(table_summary), new_content)
new_content = re.sub("file_tree_goes_here", tree_txt, new_content)
report_filename = os.path.join(dirname, 'report.html')
template_writer = open(report_filename, 'w', encoding="utf-8")
template_writer.write(new_content)
6909
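One small refinement: the writer above is never closed explicitly. A with block closes and flushes the file automatically; a sketch equivalent to the last two lines of the cell:
with open(report_filename, 'w', encoding='utf-8') as template_writer:
    template_writer.write(new_content)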
The dashboard, at the time of uploading this notebook, is quite simple. Some aspects to keep in mind: